Repeated DNA Sequences

All DNA is composed of a series of nucleotides abbreviated as A, C, G, and T, for example: “ACGAATTCCG”. When studying DNA, it is sometimes useful to identify repeated sequences within the DNA.

Write a function to find all the 10-letter-long sequences (substrings) that occur more than once in a DNA molecule.

For example,

  1. Given s = "AAAAACCCCCAAAAACCCCCCAAAAAGGGTTT",
  2. Return:
  3. ["AAAAACCCCC", "CCCCCAAAAA"].

Solution:

  1. public class Solution {
  2. public List<String> findRepeatedDnaSequences(String s) {
  3. int len = s.length();
  4. if (len < 10) {
  5. return new LinkedList<String>();
  6. }
  7. int a = 0;
  8. int m = 1;
  9. for (int i = 0; i < 10; i++) {
  10. a = a * 4 + getCode(s.charAt(i));
  11. m *= 4;
  12. }
  13. HashSet<Integer> set = new HashSet<Integer>();
  14. HashSet<Integer> r = new HashSet<Integer>();
  15. LinkedList<String> result = new LinkedList<String>();
  16. set.add(a);
  17. int p = 10;
  18. while (p < len) {
  19. a = (a * 4 + getCode(s.charAt(p))) % m;
  20. if (set.contains(a) && !r.contains(a)) {
  21. result.add(s.substring(p - 9, p + 1));
  22. r.add(a);
  23. }
  24. else {
  25. set.add(a);
  26. }
  27. p++;
  28. }
  29. return result;
  30. }
  31. private int getCode(char c) {
  32. if (c == 'C') return 1;
  33. if (c == 'G') return 2;
  34. if (c == 'T') return 3;
  35. return 0;
  36. }
  37. }